*===============================================================================
*=== FILE 3 --- SAMPLE SELECTION
*===============================================================================
* Start with no data in memory and switch off output paging so the script
* runs without waiting for a key press.
clear
set more off

* Placeholder for the working directory: put the project root (the folder
* that contains "replication output") between the quotes. The capture prefix
* suppresses any error raised by cd, so a bad path here will not stop the run.
cap cd ""


// Build one sample-selected master file per state and per input variant.
// The full state list is FL NJ SC NC; NJ, SC and NC are left out here because
// the replication pseudo data contain too few observations for them.
foreach state in FL {

	foreach suffix in "" _FR {

		* The un-suffixed run reads the "_FS" merged file; any other suffix
		* reads the merged file carrying that same suffix. The output file
		* name always uses the loop suffix unchanged.
		local src_tag "`suffix'"
		if ("`suffix'" == "") {
			local src_tag "_FS"
		}

		* Forward slashes work on every platform Stata runs on, whereas a
		* backslash separator is only understood on Windows.
		use "replication output/FourState`src_tag'_merged.dta", clear

		keep if state=="`state'"

		* Indicator: 1 if either count is positive, 0 if both are exactly zero.
		* NOTE(review): Stata treats a missing value as larger than any number,
		* so a missing word_count or pair_count also satisfies "> 0" and yields
		* has_index = 1. Confirm the counts are never missing in the input.
		gen has_index =.
		replace has_index = 1 if word_count > 0 | pair_count > 0
		replace has_index = 0 if word_count == 0 & pair_count == 0


		/* Keep listings created from 2008q1 through 2017q2 */
		* The first 10 characters of listingcreationdate are parsed as a
		* year-month-day date; month, quarter and year are derived from it.
		gen date_temp = substr(listingcreationdate, 1, 10)
		gen listing_date = date(date_temp, "YMD")
		drop date_temp
		format listing_date %td
		gen listing_my = mofd(listing_date)
		gen listing_quarter = qofd(listing_date)
		gen listing_year = yofd(listing_date)
		format listing_quarter %tq
		format listing_my %tm
		* A date that failed to parse gives a missing quarter, which compares
		* as larger than tq(2017q3) and is therefore dropped here as well.
		drop if listing_quarter <= tq(2007q4) | listing_quarter >= tq(2017q3)

		* Remove observations that are identical on every variable.
		duplicates drop

		compress


		order propertyid listingcreationdate word_count

		* Keep one row per property and listing timestamp, preferring the row
		* with the highest word_count and then the highest pair_count.
		* gsort is not a stable sort, so rows tying on all four keys would be
		* ordered arbitrarily and the surviving row could differ between runs.
		* The current row order is added as a final tie-breaker to make the
		* selection reproducible.
		tempvar load_order
		gen long `load_order' = _n
		gsort propertyid listingcreationdate -word_count -pair_count `load_order'

		* With force, the first row of each group in the current order is kept.
		duplicates drop propertyid listingcreationdate, force
		drop `load_order'

		* Stop with an error unless the pair now uniquely identifies rows.
		isid propertyid listingcreationdate


		* Convert the postal code to a number and drop rows where that fails,
		* i.e. where postalcode is empty or not a plain number.
		gen postalcode_int = real(postalcode)
		drop if postalcode_int ==.

		* The block below is part of the real preprocessing code but is
		* commented out in the replication package: the pseudo data are
		* synthetic, so postal codes and states do not line up and the
		* range filters would remove valid pseudo observations.
		/*
		if "`state'"=="FL" {
			keep if postalcode_int >=32000 & postalcode_int < 35000
		}

		if "`state'"=="NC" {
			keep if postalcode_int >=27000 & postalcode_int < 29000
		}

		if "`state'"=="NJ" {
			keep if postalcode_int >=7000 & postalcode_int < 9000
		}

		if "`state'"=="SC" {
			keep if postalcode_int >=29000 & postalcode_int < 30000
		}
		*/

		save "replication output/`state'_masterfile`suffix'.dta", replace

	}

}

